package no.trank.openpipe.reader;
import com.google.common.base.Throwables;
import com.google.common.collect.AbstractIterator;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.DocumentProducer;
import no.trank.openpipe.api.document.DomRawData;
import no.trank.openpipe.config.annotation.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.Resource;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import javax.xml.transform.dom.DOMResult;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
/**
* Streams an arbitrarily large XML file while pulling out each sub-tree rooted at a matching element name. The
* DOM is put into a {@link DomRawData}.
*
* @author David Smiley - dsmiley@mitre.org
*/
public class MultiXmlDocumentReader implements DocumentProducer
{
private final Logger log = LoggerFactory.getLogger(getClass());
@NotNull
private Resource input;
private InputStream inputStream;//fetched lazily from input once needed; closed in close().
private XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
private XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
private DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
@NotNull
private QName elemMatch;
@Override
public void init() {
}
@Override
public void close() {
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
log.warn(e.toString(),e);
}
}
}
@Override
public void fail() {
close();
}
@Override
public Iterator<Document> iterator() {
if (inputStream != null) {
throw new IllegalStateException("already fetched inputStream!");
}
return new MultiXmlDocumentIterator();
}
public void setInput(Resource input) {
this.input = input;
}
public void setElemMatch(QName elemMatch) {
this.elemMatch = elemMatch;
}
class MultiXmlDocumentIterator extends AbstractIterator<Document> {
private XMLEventReader eventReader;
private StartElement currStartEle;
@Override
protected Document computeNext() {
try {
if (currStartEle == null)
currStartEle = readTillMatchingStartEle();
if (currStartEle == null) {
return endOfData();
} else {
Document doc = readAndBuildDocument();
currStartEle = null;
return doc;
}
} catch (Exception e) {
throw Throwables.propagate(e);
}
}
private StartElement readTillMatchingStartEle() throws Exception {
assert currStartEle == null;
if (eventReader == null) {
inputStream = input.getInputStream();
eventReader = xmlInputFactory.createXMLEventReader(inputStream);
}
while(eventReader.hasNext()) {
XMLEvent evt = eventReader.nextEvent();
if (evt.isStartElement()) {
StartElement ele = evt.asStartElement();
if (ele.getName().equals(elemMatch)) {
return ele;
}
}
}
return null;
}
private Document readAndBuildDocument() throws Exception {
assert currStartEle != null;
DocumentBuilder docBuilder = documentBuilderFactory.newDocumentBuilder();
DOMResult domResult = new DOMResult(docBuilder.newDocument());
XMLEventWriter eventWriter = xmlOutputFactory.createXMLEventWriter(domResult);
int depth = 1;
eventWriter.add(currStartEle);
while(eventReader.hasNext() && depth > 0) {
XMLEvent evt = eventReader.nextEvent();
eventWriter.add(evt);
if (evt.isStartElement()) {
depth++;
} else if (evt.isEndElement()) {
depth--;
}
}
return new Document(new DomRawData(null,domResult.getNode()));
}
}
}